import os
import openai
import time
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, JSONLoader

# Build a persistent Chroma vector store from the BioEng JSON corpus.
#
# Steps: load the API key, load the JSON documents, split them into chunks,
# embed the chunks with OpenAI, store them in Chroma, and report the timing.

# Load the API key into the environment first, so that a missing key is
# reported immediately instead of after the corpus has been loaded and split.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Configure the dataset loader: every *.json file in the corpus directory is
# read with JSONLoader.
loader = DirectoryLoader(
    path="planning/data/corpus/BioEng",
    glob="*.json",
    loader_cls=JSONLoader,
    show_progress=True,
    loader_kwargs={
        # jq expression that concatenates the (stringified) id, the title and
        # the AI-generated description into a single text string.
        "jq_schema": '"Id: " + (.id | tostring) + " Title: " + .title + " Description: " + .ai_generated_description',
    }
)

# Load the documents.
docs = loader.load()
print("Number of documents:", len(docs))
if not docs:
    # Fail with a clear message rather than an IndexError on docs[0] below.
    raise RuntimeError(
        "No documents were loaded from 'planning/data/corpus/BioEng' (glob '*.json')."
    )
# Print the first document as a quick sanity check of the jq_schema output.
print(docs[0].page_content)

# Split the documents into smaller chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(docs)
print("Number of document blocks:", len(splits))

# Embedding model: text-embedding-ada-002.
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
persist_directory = "planning/data/BioEng_sampled_vectorstore/"

# perf_counter() is monotonic, so the measured interval cannot be distorted by
# system clock adjustments the way time.time() can.
t1 = time.perf_counter()

# Create the Chroma vector database from the chunks.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings_model,
    persist_directory=persist_directory
)

# Persist to disk.
# NOTE(review): newer Chroma releases are reported to persist automatically
# and to mark persist() as deprecated — confirm against the installed version.
vectorstore.persist()

# _collection is a private attribute of the Chroma wrapper; it is used here
# only to report how many vectors were stored.
print('Number of vectors:', vectorstore._collection.count())
print('Vector storage completed!')

t2 = time.perf_counter()
print(f'Embedding and store take time: {t2-t1} sec.')